# Import Pandas
import pandas as pd
# Import numpy
import numpy as np
# Import matplotlib
import matplotlib.pyplot as plt
# Import plotly
import plotly.express as px
import plotly.graph_objects as go
# Import statsmodels
from statsmodels.formula.api import ols
import statsmodels.api as sm
# Load the salary dataset from the local CSV file
df = pd.read_csv(filepath_or_buffer='C:/Users/rgens/Datasets/Salary_dataset.csv')
# Preview the first rows to check what was loaded
df.head()
| | Unnamed: 0 | YearsExperience | Salary |
|---|---|---|---|
| 0 | 0 | 1.2 | 39344.0 |
| 1 | 1 | 1.4 | 46206.0 |
| 2 | 2 | 1.6 | 37732.0 |
| 3 | 3 | 2.1 | 43526.0 |
| 4 | 4 | 2.3 | 39892.0 |
# Drop the 'Unnamed: 0' column, which is not used in the analysis
# (in the preview it appears to simply repeat the row index)
df2 = df.drop(columns=['Unnamed: 0'])
# Preview the cleaned data
df2.head()
| | YearsExperience | Salary |
|---|---|---|
| 0 | 1.2 | 39344.0 |
| 1 | 1.4 | 46206.0 |
| 2 | 1.6 | 37732.0 |
| 3 | 2.1 | 43526.0 |
| 4 | 2.3 | 39892.0 |
# Interactive boxplot for the X variable (YearsExperience)
# Draw the box with every observation plotted as a point (points='all')
fig = px.box(df2['YearsExperience'], points='all')
# Center the title, label the y-axis, and fix the figure size
fig.update_layout(
    title='Distribution of X',
    title_x=0.5,
    yaxis_title='Years of Experience',
    width=800,
    height=500,
)
# Render the plot
fig.show()
# Interactive boxplot for the Y variable (Salary), built the same way
fig = px.box(df2['Salary'], points='all')
# Same centered title and figure size, with the y-axis labelled in USD
fig.update_layout(
    title='Distribution of Y',
    title_x=0.5,
    yaxis_title='Salary in USD',
    width=800,
    height=500,
)
# Render the plot
fig.show()
# Scatterplot of YearsExperience vs Salary
# Each plotly update_* call returns the figure it was called on, so the
# plot is assembled as a single chained expression
fig = (
    px.scatter(x=df2['YearsExperience'], y=df2['Salary'])
    # Centered title, axis labels, and figure size
    .update_layout(
        title='YearsExperience vs Salary',
        title_x=0.5,
        xaxis_title='Years of Experience',
        yaxis_title='Salary in USD',
        height=500,
        width=700,
    )
    # Black axis lines, mirrored so they frame the whole plot area
    .update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
    .update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
)
# Render the plot
fig.show()
# Simple linear regression: fit Salary as a function of YearsExperience by OLS
model = ols(
    formula='Salary ~ YearsExperience',
    data=df2,
).fit()
# Fitted coefficients (Intercept and the YearsExperience slope)
model.params
Intercept          24848.203967
YearsExperience     9449.962321
dtype: float64
# Show the full fit summary for the model: goodness-of-fit statistics
# (R-squared, F-statistic, AIC/BIC), the coefficient table with standard
# errors and confidence intervals, and residual diagnostics
model.summary()
| Dep. Variable: | Salary | R-squared: | 0.957 |
|---|---|---|---|
| Model: | OLS | Adj. R-squared: | 0.955 |
| Method: | Least Squares | F-statistic: | 622.5 |
| Date: | Tue, 07 Mar 2023 | Prob (F-statistic): | 1.14e-20 |
| Time: | 20:45:31 | Log-Likelihood: | -301.44 |
| No. Observations: | 30 | AIC: | 606.9 |
| Df Residuals: | 28 | BIC: | 609.7 |
| Df Model: | 1 | | |
| Covariance Type: | nonrobust | | |
| | coef | std err | t | P>\|t\| | [0.025 | 0.975] |
|---|---|---|---|---|---|---|
| Intercept | 2.485e+04 | 2306.654 | 10.772 | 0.000 | 2.01e+04 | 2.96e+04 |
| YearsExperience | 9449.9623 | 378.755 | 24.950 | 0.000 | 8674.119 | 1.02e+04 |
| Omnibus: | 2.140 | Durbin-Watson: | 1.648 |
|---|---|---|---|
| Prob(Omnibus): | 0.343 | Jarque-Bera (JB): | 1.569 |
| Skew: | 0.363 | Prob(JB): | 0.456 |
| Kurtosis: | 2.147 | Cond. No. | 13.6 |
# Regression diagnostic plots for the YearsExperience predictor, drawn on a
# freshly created 12x8 matplotlib figure
fig = sm.graphics.plot_regress_exog(
    model,
    'YearsExperience',
    fig=plt.figure(figsize=(12, 8)),
)
# Q-Q plot of the model residuals with a 45-degree reference line
sm.qqplot(model.resid, fit=True, line='45')
# Display the matplotlib figures
plt.show()
eval_env: 1
# Use the fitted model to make some predictions
# Predictor values to evaluate: whole years of experience from 0 through 11
X = pd.DataFrame({'YearsExperience': np.arange(12)})
# Predicted salary for each of those values
predict = model.predict(X)
# Print the predictions
print(predict)
0      24848.203967
1      34298.166288
2      43748.128609
3      53198.090931
4      62648.053252
5      72098.015574
6      81547.977895
7      90997.940217
8     100447.902538
9     109897.864860
10    119347.827181
11    128797.789503
dtype: float64
# Overlay the model's predictions on the observed data
fig = go.Figure()
# Observed data ('train') drawn as dark red markers
fig.add_trace(go.Scatter(
    x=df2['YearsExperience'],
    y=df2['Salary'],
    name='train',
    mode='markers',
    marker_color='rgba(152, 0, 0, .8)',
))
# Model predictions drawn as connected points with dark blue markers
fig.add_trace(go.Scatter(
    x=X.iloc[:, 0].tolist(),
    y=predict,
    name='prediction',
    mode='lines+markers',
    marker_color='rgba(0, 0, 152, .8)',
))
# Centered title and axis labels
fig.update_layout(
    title='YearsExperience vs Salary W/ Model',
    title_x=0.5,
    xaxis_title="Years of Experience",
    yaxis_title="Salary in USD",
)
# Black axis lines, mirrored so they frame the whole plot area
fig.update_xaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=2, linecolor='black', mirror=True)
# Render the plot
fig.show()